# packages and data
library(tigris)
library(sf)
library(tidyverse)
library(randomcoloR)
library(leaflet)
# Ramsey county block groups
bg <- block_groups(state = "MN", county = "Ramsey") %>%
st_transform(26915)
# True census tracts
tract <- tracts(state = "MN", county = "Ramsey") %>%
st_transform(26915)
# Jolly tracts
aggregate <- st_read("code/output/prototype.shp") %>%
st_transform(26915)
As an initial picture, there are 401 block groups in Ramsey county.
leaflet(bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = "#faf7cf",
weight = 2,
opacity = 0.8,
color = "#b0b5bf",
fillOpacity = 0.6)
And 137 census tracts.
leaflet(tract %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = "#e89f9b",
weight = 2,
opacity = 0.8,
color = "#b0b5bf",
fillOpacity = 0.6)
As an example about how to aggregate the block to groups to a larger geography to see a decrease in the margin of error I’ll use percent white as the target variable. The spatial distribution of the total white population is mapped below.
percent_white <- read_csv("code/example_data/white_prop_est_ramsey.csv") %>% left_join(read_csv("code/example_data/race_moe_est_ramsey.csv"))
percent_white_bg <- bg %>% left_join(percent_white %>% mutate(id = as.character(id)), by = c("GEOID" = "id"))
pal <- colorQuantile("RdPu", domain = percent_white_bg$estimate_not_hispanic_or_latino_white_alone, n = 5)
leaflet(percent_white_bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(estimate_not_hispanic_or_latino_white_alone),
weight = 2,
opacity = 1,
color = "white",
fillOpacity = 0.7) %>%
addLegend(position = "bottomright",
pal = pal,
values = ~estimate_not_hispanic_or_latino_white_alone,
title = "Total white population")
But we know that some block groups have high margins of error. Mapped below are the coefficients of variation (cv), or the percent of the estimate covered by the margin of error.
percent_white_bg <- percent_white_bg %>%
mutate(cv = if_else(estimate_not_hispanic_or_latino_white_alone == 0, NA_real_, (margin_of_error_not_hispanic_or_latino_white_alone / 1.645) / estimate_not_hispanic_or_latino_white_alone))
pal <- colorQuantile("PuBu", domain = percent_white_bg$cv, na.color = "#edf0f2", n = 5)
leaflet(percent_white_bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(cv),
weight = 2,
opacity = 1,
color = "white",
fillOpacity = 0.7) %>%
addLegend(position = "bottomright",
pal = pal,
values = ~cv,
title = "Total white population coefficient of variation")
We can see below the most common cv is about 0.12, or 12% of the estimate for a particular block group. The general guideline is that estimates with a cv less than 0.12 are “highly reliable.” Only 30% of our block groups meet that threshold.
#sum(percent_white_bg$cv > 0.12, na.rm = TRUE) /nrow(percent_white_bg)
ggplot(percent_white_bg, aes(x = cv)) +
geom_histogram(fill = "#688da6", color = "white") +
scale_x_continuous(breaks = seq(0, 1, .1)) +
theme_minimal() +
labs(y = "Block groups", x = "Coefficient of variation (total white population)")
Instead, we can aggregate the block groups to some other geography that makes sense contextually. That context changes based on the input variables. In this case we only care about contiguous block groups that look similar based on their white population. We will aggregate the block groups until each new aggregate meets some minimum cv threshold. Initially I set it to 10%, so it could go higher in later iterations.
The aggregation algorithm created 230 new geography units. Notably, that is a better resolution than the 137 census tracts. We also are guaranteed some level of homogeneity, which is not necessarily true for tracts. Below are the new aggregations with the census tract boundaries overlayed. Fill color indicates the assigned region.
colors <- randomcoloR::distinctColorPalette(k = 230)
pal <- colorFactor(colors, domain = as.factor(aggregate$rids))
leaflet(aggregate %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(as.character(rids)),
weight = 2,
opacity = 1,
color = NA,
fillOpacity = 0.7,
stroke = FALSE) %>%
addPolygons(data = tract %>% st_transform(4326),
color = "black",
weight = 2,
fillColor = NA,
fillOpacity = 0)
We see considerable improvement in the margins of error, but there are some that still fall above the 10% threshold so I can experiment more with other variables and thresholds. About 53% are still above the 0.12 mark, but the extremes are much closer to the desired threshold.
aggregate_rids <- aggregate %>%
select(-ALAND, -AWATER) %>%
left_join(percent_white_bg %>% st_set_geometry(NULL)) %>%
group_by(rids) %>%
summarise(sumest = sum(estimate_not_hispanic_or_latino_white_alone),
moeest = tidycensus::moe_sum(margin_of_error_not_hispanic_or_latino_white_alone, estimate = estimate_not_hispanic_or_latino_white_alone)) %>%
filter(rids != -999) %>%
mutate(cv = (moeest / 1.645)/sumest)
#sum(aggregate_rids$cv > 0.12, na.rm = TRUE) /nrow(aggregate_rids)
ggplot(aggregate_rids, aes(x = cv)) +
geom_histogram(fill = "#688da6", color = "white") +
scale_x_continuous(breaks = seq(0, 1, .1)) +
theme_minimal() +
labs(y = "Aggregates/jolly tracts", x = "Coefficient of variation (total white population)")
pal <- colorQuantile("PuBu", domain = percent_white_bg$cv, na.color = "#edf0f2", n = 5)
leaflet(aggregate_rids %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(cv),
weight = 2,
opacity = 1,
color = "white",
fillOpacity = 0.7) %>%
addLegend(position = "bottomright",
pal = pal,
values = ~cv,
title = "Aggregated coefficient of variation")
# packages and data
library(tigris)
library(sf)
library(tidyverse)
library(randomcoloR)
library(leaflet)
# take out only met council region
counties_fips <- c("003", "053", "123", "163", "019", "139", "037")
# Ramsey county block groups
bg <- st_read("code/output/shp/tl_2017_27_bg.shp") %>%
st_transform(26915) %>%
filter(COUNTYFP %in% counties_fips)
# True census tracts
tract <- st_read("code/output/shp/tl_2017_27_tract.shp") %>%
st_transform(26915) %>%
filter(COUNTYFP %in% counties_fips)
# Jolly tracts
aggregate <- st_read("code/output/metc7cnty_4var.shp") %>%
st_transform(26915)
As an initial picture, there are 2,085 block groups in the seven county region.
leaflet(bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = "#faf7cf",
weight = 2,
opacity = 0.8,
color = "#b0b5bf",
fillOpacity = 0.6)
And 704 census tracts.
leaflet(tract %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = "#e89f9b",
weight = 2,
opacity = 0.8,
color = "#b0b5bf",
fillOpacity = 0.6)
We want to be able to subdivide the region into some smaller geographies that are contextually meaningful.
The variables I will use are percent white to illustrate demographics, over 65 and under 18 to illustrate family structure, and housing units built since 2000 to illustrate the housing stock. These are all going to be counted as proportions in the algorithm, rather than raw counts.
By mapping the covariates individually, we can see some patterns.
demographics <- read_csv("code/metc_data/target/target_estimates_metc.csv") %>% left_join(read_csv("code/metc_data/target/target_moe_metc.csv"))
demographics_bg <- bg %>% left_join(demographics %>% mutate(id = as.character(id)), by = c("GEOID" = "id")) %>%
mutate(percent_white = totalwhite/totalppl)
pal <- colorQuantile("RdPu", domain = demographics_bg$percent_white, n = 5)
leaflet(demographics_bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(percent_white),
weight = 2,
opacity = 1,
color = "white",
fillOpacity = 0.7) %>%
addLegend(position = "bottomright",
pal = pal,
values = ~percent_white,
title = "Percent white population")
demographics_bg$percent_over_65 <- demographics_bg$over65 / demographics_bg$total65
pal <- colorQuantile("OrRd", domain = demographics_bg$percent_over_65, n = 5)
leaflet(demographics_bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(percent_over_65),
weight = 2,
opacity = 1,
color = "white",
fillOpacity = 0.7) %>%
addLegend(position = "bottomright",
pal = pal,
values = ~percent_over_65,
title = "Percent over 65 years old")
demographics_bg$hu_since_2000 <- demographics_bg$built_after_2000 / demographics_bg$totalhu
pal <- colorQuantile("PuRd", domain = demographics_bg$hu_since_2000, n = 4)
leaflet(demographics_bg %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(hu_since_2000),
weight = 2,
opacity = 1,
color = "white",
fillOpacity = 0.7) %>%
addLegend(position = "bottomright",
pal = pal,
values = ~hu_since_2000,
title = "Housing stock built since 2000")
The regionalization algorithm generated 167 aggregate regions. This is lower than I had expected.
colors <- randomcoloR::distinctColorPalette(k = 230)
pal <- colorFactor(colors, domain = as.factor(aggregate$rids))
leaflet(aggregate %>% st_transform(4326)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(as.character(rids)),
weight = 2,
opacity = 1,
color = NA,
fillOpacity = 0.7,
stroke = FALSE) %>%
addPolygons(data = tract %>% st_transform(4326),
color = "black",
weight = 2,
fillColor = NA,
fillOpacity = 0)